* ---------------------------------------------------------------
* Setup: project root, configuration, log file, optional arguments
* ---------------------------------------------------------------
* Positional arguments of this do-file:
*   1  project root directory
*   2  weight category   (optional)
*   3  weight versions   (optional)
*   4  weight window     (optional)
*   5  weight type       (optional)
* An optional argument passed as the placeholder ___EMPTY___ is treated as not given.

global root_dir = "`1'"

* Shared configuration.
* NOTE(review): presumably defines log_dir, alm_data_raw and alm_data_proc - confirm in config.do
include "$root_dir/code/config/config.do"

* Open a named log. The path is quoted so that a log directory containing
* spaces does not split the file name into several tokens.
cap noi log using "${log_dir}/1_map_ipc4_current_sic.log", replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

* Each global below is only overwritten when its argument was supplied,
* so a value set earlier (for example by the config file) is left untouched.
if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
    global wtype "$arg4"
}
* Shown unconditionally: prints whatever wtype currently holds, possibly an empty line.
di "${wtype}"
capture noi {

* v3 1_map_ipc4_to_sic_mfg

/* This do-file builds a weighted concordance table between C/IPC codes and SIC codes,
 applying three external crosswalks to map US patents (with IPC codes)
 to USPTO patent IDs, to Compustat firm IDs (gvkey) and to SIC industries.

 Output: cw_ipc4_current_sic.dta in the alm_data_proc directory, holding one row
 per (ipc4, current_sic) pair together with w_ipc4_current_sic, the share of the
 weighted patent mass of an ipc4 class that falls into that SIC code.

 Everything runs inside capture noisily, so an error stops this section without
 aborting the do-file; the return code is inspected right after the block.
*/

* --------------------------------
* A. Prepare concordances
* --------------------------------

* prepare the three concordances as inputs in stata:

* (1a) DOCDB family id to USPTO patent id
*Coelli, Federica. 2023. Automation CHO, Github repository, https://github.com/federicacoelli/Automation_CHO_src
import delimited "${alm_data_raw}/concordances/docdb_uspto_patent_id.csv", clear
* Prefix the id with a zero.
* NOTE(review): presumably aligns the id format with the patent-to-gvkey crosswalk - confirm.
* NOTE(review): this string concatenation requires patent_id to be imported as a string - confirm.
replace patent_id = "0" + patent_id
tempfile docdb_uspto_patent_id
save "`docdb_uspto_patent_id'"

* (1b) USPTO patent id to Compustat gvkey
*(I1) David Autor, David Dorn, Gordon Hanson, Gary Pisano, and Pian Shu.
*"Foreign Competition and Domestic Innovation: Evidence from U.S. Patents."
*American Economic Review: Insights, 2(3): 357-374, 2020. Accessed June 2020. https://www.ddorn.net/data.htm.
use "${alm_data_raw}/concordances/cw_patent_compustat_adhps.dta", clear
rename patent patent_id
keep patent_id gvkey
tempfile uspto_patent_id_gvkey
save "`uspto_patent_id_gvkey'"

* (1c) Compustat gvkey to current SIC code
*(I1) David Autor, David Dorn, Gordon Hanson, Gary Pisano, and Pian Shu.
*"Foreign Competition and Domestic Innovation: Evidence from U.S. Patents."
* American Economic Review: Insights, 2(3): 357-374, 2020. Accessed June 2020. https://www.ddorn.net/data.htm.
use "${alm_data_raw}/concordances/compustat_sic.dta", clear
* one row per distinct gvkey-SIC pair
duplicates drop gvkey current_sic, force
keep gvkey current_sic
tempfile gvkey_sic
save "`gvkey_sic'"

* --------------------------------
* B. Apply to patents
* --------------------------------

* load US patents
use "${alm_data_proc}/US_docdb_cipc.dta", clear
* the 4-character IPC class is the leading part of the 6-character code
gen ipc4 = substr(cipc6, 1, 4)

* apply 1a) and keep granted only
* joinby forms all pairwise combinations within a family; families without a match
* are kept by unmatched(master) but then fall out of the granted filter whenever
* granted is missing for them
joinby docdb_family_id using "`docdb_uspto_patent_id'", unmatched(master)
drop _merge
keep if granted==1

* keep machinery
keep if in_relevant_field==1
* use them to generate weights
* NOTE(review): dup_all is carried along but never used in the weights below - confirm whether it is still needed
duplicates tag docdb_family_id cipc6, gen(dup_all)

* apply 1b)
joinby patent_id using "`uspto_patent_id_gvkey'", unmatched(master)
drop _merge
keep docdb_family_id appln_year cipc6 patent_id gvkey dup_all ipc4

*when we matched the cw_patent_compustat_adhps.dta file - we only have patents granted before March 2013
* i.e., we restrict the concordance table to years 1976 - 2010
local start_y = 1976
local end_y = 2010
keep if appln_year>= `start_y' & appln_year<= `end_y'

* apply 1.c) gvkey to SIC industry of manufacturing
* NOTE(review): no manufacturing filter is applied in this file - presumably the SIC crosswalk is already restricted; confirm
* unmatched(both) is only used so that the tabulation shows the match rates;
* solely matched observations are retained afterwards
joinby gvkey using "`gvkey_sic'", unmatched(both)
tab _merge
keep if _merge==3
drop _merge

* build a ipc4 to current sic4 weight, adjusted for how many IPC codes are assigned to the patents (more => weaker weight)

* rows sharing the same family and cipc6 (created by the joins above) split one unit equally
duplicates tag docdb_family_id cipc6, gen(dup_matched)
gen count_matched = 1 + dup_matched
gen dup_weight = 1/count_matched

* docdbs = number of distinct cipc6 codes within a family:
* flag the first row of every family-cipc6 cell, take the running sum of the flags
* within the family and spread the final value over all rows of the family
bys docdb_family_id cipc6: gen docdbs = _n == 1
by docdb_family_id: replace docdbs = sum(docdbs)
by docdb_family_id: replace docdbs = docdbs[_N]
gen docdb_weight = 1 / docdbs

* combined row weight; by construction the rows of one family add up to one
gen docdb_weight_dupadj = docdb_weight * dup_weight

* share of the weighted mass of each ipc4 that lands in each SIC code
* egen total() is the documented name of the older egen sum()
egen sum_by_ipc4_sic = total(docdb_weight_dupadj), by(ipc4 current_sic)
egen sum_by_ipc4 = total(docdb_weight_dupadj), by(ipc4)
gen weight_ipc4_sic_mfg = sum_by_ipc4_sic/sum_by_ipc4

* collapse to ipc4-sic dataset
* the weight is constant within an ipc4-SIC cell, so any single row per cell can be kept
duplicates drop ipc4 current_sic, force
sort ipc4 current_sic
ren weight_ipc4_sic_mfg w_ipc4_current_sic
keep ipc4 current_sic w_ipc4_current_sic
save "${alm_data_proc}/cw_ipc4_current_sic.dta", replace

}
* Report the outcome of the captured section above: _rc still holds its return
* code here, and any nonzero value means that a command inside it failed.
if _rc != 0 {
    display "Execution finished with errors."
}
else {
    display "Execution finished successfully."
}

* Close the named log; capture keeps this quiet when that log is not open.
cap log close dat